In [ ]:
##################################################################################################
##   Notebook for extracting text from HTML files and running some basic preprocessing tasks
##   v2.0 Preprocessing Text   
##       - Stopword removal
##       - Stemming/ Lemmatization
##
##   Required packages: os, BeautifulSoup, nltk, wordcloud, matplotlib
##   The following is used from the nltk packages: corpora/stopwords, SnowballStemmer, WordNetLemmatizer
##
##################################################################################################

In [ ]:
import os 
from bs4 import BeautifulSoup as bs

In [ ]:
# Function for removing HTML tags from a document
def clean_html(htmlDoc):
    soup = bs(htmlDoc, 'html.parser')  # Parse the document so that HTML tags can be stripped
    for element in soup(["script", "style", "title", '[document]', 'head']):
        element.extract()  # Drop non-content elements entirely
    # Keep only the visible text and strip non-ASCII characters
    cleaned = soup.get_text(separator=' ').encode('ascii', 'ignore').decode('ascii')
    return cleaned.strip()
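
In [ ]:
## Quick sanity check of clean_html on a small inline snippet (illustrative only, not part
## of the speech corpus): the tags and the <script> block should disappear from the output.
sample = '<html><head><title>t</title></head><body><p>Hello <b>world</b></p><script>var x=1;</script></body></html>'
clean_html(sample)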

In [ ]:
## Read all the html files and open the first speech
rootDir = 'E:\\NLP Session\\RBIGovernorSpeeches\\'
htmlFiles = [f for f in os.listdir(rootDir) if f.endswith('.html')]

fileName = rootDir + htmlFiles[0] 
cleanedtext = clean_html(open(fileName).read())

In [ ]:
###########################################################################################################################
##  Second paragraph from the speech
###########################################################################################################################
text1 = 'Over the last few weeks, I have outlined the RBI’s approach to inflation, distressed debt, financial inclusion, banking sector reform, and market reform. Today, I’d like to first discuss why central banking is not as easy as it appears (just raise or cut interest rates!) and why it needs decisions, sometimes unpopular or hard-to-explain ones, to be made under conditions of extreme uncertainty. This will then lead in to my arguments about why we need an independent central bank.'
print(text1)
###########################################################################################################################
##  Removing stopwords using two methods:
##    1. Using nltk corpora of stopwords
##    2. Custom stopwords file
##
###########################################################################################################################

In [ ]:
#from nltk.corpus import stopwords
#stopWords = set(stopwords.words('mystopwords'))

stopWords = [line.strip() for line in open('stopwords')]
stopWords
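
In [ ]:
## Alternative (method 1 above): use the English stopword list that ships with nltk.
## This assumes the corpus has been downloaded once via nltk.download('stopwords');
## the custom 'stopwords' file loaded above is what the rest of the notebook uses.
import nltk
nltk.download('stopwords')            # one-time download of the corpus
from nltk.corpus import stopwords
nltkStopWords = set(stopwords.words('english'))
len(nltkStopWords)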

In [ ]:
## Use a list comprehension to lowercase the text, tokenize it on whitespace,
## and drop any token that appears in the stopwords list
[w for w in text1.lower().split() if w not in stopWords]

In [ ]:
## There are some non-alphanumeric characters in the text; removing them from the corpus
##
text2 = ''.join(w for w in text1 if (w.isalnum() or w ==' '))
text2
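
In [ ]:
## The same punctuation stripping can also be done with a regular expression; shown here
## only as a roughly equivalent alternative to the character-by-character join above.
import re
re.sub(r'[^A-Za-z0-9 ]', '', text1)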

In [ ]:
## Removing stopwords from the cleaned text
##
[w for w in text2.lower().split() if w not in stopWords]
############################################################################################################################
##  Object Standardization
##  Using stemming or lemmatization
############################################################################################################################

In [ ]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

clean_text = [w for w in text2.lower().split() if w not in stopWords]  ### << Stop word removal

stemmed_words = [stemmer.stem(word) for word in clean_text] 
stemmed_words
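
In [ ]:
## The Snowball stemmer chops suffixes mechanically, so some stems are not dictionary
## words; a few illustrative tokens from the speech, to compare against the lemmatizer
## output in the next cells.
[stemmer.stem(w) for w in ['decisions', 'conditions', 'banking', 'uncertainty']]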

In [ ]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

clean_text = [w for w in text2.lower().split() if w not in stopWords]

lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in clean_text] 
lemmatized_words
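
In [ ]:
## WordNetLemmatizer needs the wordnet corpus (nltk.download('wordnet')) and, without a
## POS tag, treats every token as a noun. Passing pos='v' handles verbs; shown here on a
## few illustrative tokens only.
# nltk.download('wordnet')   # uncomment if the corpus is not yet installed
[wordnet_lemmatizer.lemmatize(w, pos='v') for w in ['outlined', 'raised', 'needs']]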

In [ ]:
##################################################################################################
## Drawing a wordcloud using the wordcloud package
##
##################################################################################################

from os import path
from wordcloud import WordCloud
# Display the generated image the matplotlib way
import matplotlib.pyplot as plt

def drawWordcloud(text):
    # Cap the maximum font size so more words fit into the image
    wordcloud = WordCloud(max_font_size=40).generate(text)
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    return wordcloud
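
In [ ]:
## Optional tweaks (a sketch, not used below): WordCloud also accepts its own stopword
## set and a background colour, and the image can be written to disk with to_file().
## drawWordcloud_custom and 'speech_wc.png' are illustrative names only.
def drawWordcloud_custom(text, outFile='speech_wc.png'):
    wc = WordCloud(max_font_size=40, background_color='white',
                   stopwords=set(stopWords)).generate(text)
    wc.to_file(outFile)     # save the rendered cloud as a PNG
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    return wc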

In [ ]:
fileName = rootDir + htmlFiles[0] 
fileName

In [ ]:
fileName = rootDir + htmlFiles[0] 
text = clean_html(open(fileName).read())
clean_text = [w for w in text.lower().split() if w not in stopWords]
clean_text = ' '.join(clean_text)
clean_text

In [ ]:
drawWordcloud(clean_text)

In [ ]:
## Running the same code for a different speech
##
fileName = rootDir + htmlFiles[4] 
text = clean_html(open(fileName).read())
clean_text = [w for w in text.lower().split() if w not in stopWords]
clean_text = ' '.join(clean_text)
drawWordcloud(clean_text)
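
In [ ]:
## The same pipeline can be run over every speech in the folder; a minimal sketch,
## assuming all files in rootDir parse cleanly with clean_html.
for f in htmlFiles:
    text = clean_html(open(rootDir + f).read())
    tokens = [w for w in text.lower().split() if w not in stopWords]
    drawWordcloud(' '.join(tokens))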

In [ ]: